1   /*
2    * Copyright (C) 2013 The Guava Authors
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    * http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  
17  package com.google.common.base;
18  
19  import com.google.common.annotations.GwtCompatible;
20  
21  import junit.framework.TestCase;
22  
23  /**
24   * Unit tests for {@link Utf8}.
25   *
26   * @author Jon Perlow
27   * @author Martin Buchholz
28   * @author Clément Roux
29   */
30  @GwtCompatible(emulated = true)
31  public class Utf8Test extends TestCase {
32    public void testEncodedLength_validStrings() {
33      assertEquals(0, Utf8.encodedLength(""));
34      assertEquals(11, Utf8.encodedLength("Hello world"));
35      assertEquals(8, Utf8.encodedLength("Résumé"));
36      assertEquals(461, Utf8.encodedLength("威廉·莎士比亞(William Shakespeare,"
37          + "1564年4月26號—1616年4月23號[1])係隻英國嗰演員、劇作家同詩人,"
38          + "有時間佢簡稱莎翁;中國清末民初哈拕翻譯做舌克斯毕、沙斯皮耳、筛斯比耳、"
39          + "莎基斯庇尔、索士比尔、夏克思芘尔、希哀苦皮阿、叶斯壁、沙克皮尔、"
40          + "狹斯丕爾。[2]莎士比亞編寫過好多作品,佢嗰劇作響西洋文學好有影響,"
41          + "哈都拕人翻譯做好多話。"));
42      // A surrogate pair
43      assertEquals(4, Utf8.encodedLength(
44          newString(Character.MIN_HIGH_SURROGATE, Character.MIN_LOW_SURROGATE)));
45    }
46  
47    public void testEncodedLength_invalidStrings() {
48      testEncodedLengthFails(newString(Character.MIN_HIGH_SURROGATE), 0);
49      testEncodedLengthFails("foobar" + newString(Character.MIN_HIGH_SURROGATE), 6);
50      testEncodedLengthFails(newString(Character.MIN_LOW_SURROGATE), 0);
51      testEncodedLengthFails("foobar" + newString(Character.MIN_LOW_SURROGATE), 6);
52      testEncodedLengthFails(
53          newString(
54              Character.MIN_HIGH_SURROGATE,
55              Character.MIN_HIGH_SURROGATE), 0);
56    }
57  
58    private static void testEncodedLengthFails(String invalidString,
59        int invalidCodePointIndex) {
60      try {
61        Utf8.encodedLength(invalidString);
62        fail();
63      } catch (IllegalArgumentException expected) {
64        assertEquals("Unpaired surrogate at index " + invalidCodePointIndex,
65            expected.getMessage());
66      }
67    }
68  
69    // 128 - [chars 0x0000 to 0x007f]
70    private static final long ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS =
71        0x007f - 0x0000 + 1;
72  
73    // 128
74    private static final long EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT =
75        ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS;
76  
77    // 1920 [chars 0x0080 to 0x07FF]
78    private static final long TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS =
79        0x07FF - 0x0080 + 1;
80  
81    // 18,304
82    private static final long EXPECTED_TWO_BYTE_ROUNDTRIPPABLE_COUNT =
83        // Both bytes are one byte characters
84        (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 2) +
85        // The possible number of two byte characters
86        TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS;
87  
88    // 2048
89    private static final long THREE_BYTE_SURROGATES = 2 * 1024;
90  
91    // 61,440 [chars 0x0800 to 0xFFFF, minus surrogates]
92    private static final long THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS =
93        0xFFFF - 0x0800 + 1 - THREE_BYTE_SURROGATES;
94  
95    // 2,650,112
96    private static final long EXPECTED_THREE_BYTE_ROUNDTRIPPABLE_COUNT =
97        // All one byte characters
98        (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 3) +
99        // One two byte character and a one byte character
100       2 * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS *
101           ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS +
102        // Three byte characters
103       THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS;
104 
105   // 1,048,576 [chars 0x10000L to 0x10FFFF]
106   private static final long FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS =
107       0x10FFFF - 0x10000L + 1;
108 
109   // 289,571,839
110   private static final long EXPECTED_FOUR_BYTE_ROUNDTRIPPABLE_COUNT =
111       // All one byte characters
112       (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 4) +
113       // One and three byte characters
114       2 * THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS *
115           ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS +
116       // Two two byte characters
117       TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS +
118       // Permutations of one and two byte characters
119       3 * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS *
120           ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS *
121           ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS +
122       // Four byte characters
123       FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS;
124 
125   /**
126    * Tests that round tripping of a sample of four byte permutations work.
127    * All permutations are prohibitively expensive to test for automated runs.
128    * This method tests specific four-byte cases.
129    */
130   public void testIsWellFormed_4BytesSamples() {
131     // Valid 4 byte.
132     assertWellFormed(0xF0, 0xA4, 0xAD, 0xA2);
133     // Bad trailing bytes
134     assertNotWellFormed(0xF0, 0xA4, 0xAD, 0x7F);
135     assertNotWellFormed(0xF0, 0xA4, 0xAD, 0xC0);
136     // Special cases for byte2
137     assertNotWellFormed(0xF0, 0x8F, 0xAD, 0xA2);
138     assertNotWellFormed(0xF4, 0x90, 0xAD, 0xA2);
139   }
140 
141   /** Tests some hard-coded test cases. */
142   public void testSomeSequences() {
143     // Empty
144     assertWellFormed();
145     // One-byte characters, including control characters
146     assertWellFormed(0x00, 0x61, 0x62, 0x63, 0x7F); // "\u0000abc\u007f"
147     // Two-byte characters
148     assertWellFormed(0xC2, 0xA2, 0xC2, 0xA2); // "\u00a2\u00a2"
149     // Three-byte characters
150     assertWellFormed(0xc8, 0x8a, 0x63, 0xc8, 0x8a, 0x63); // "\u020ac\u020ac"
151     // Four-byte characters
152     // "\u024B62\u024B62"
153     assertWellFormed(0xc9, 0x8b, 0x36, 0x32, 0xc9, 0x8b, 0x36, 0x32);
154     // Mixed string
155     // "a\u020ac\u00a2b\\u024B62u020acc\u00a2de\u024B62"
156     assertWellFormed(0x61, 0xc8, 0x8a, 0x63, 0xc2, 0xa2, 0x62, 0x5c, 0x75, 0x30,
157         0x32, 0x34, 0x42, 0x36, 0x32, 0x75, 0x30, 0x32, 0x30, 0x61, 0x63, 0x63,
158         0xc2, 0xa2, 0x64, 0x65, 0xc9, 0x8b, 0x36, 0x32);
159     // Not a valid string
160     assertNotWellFormed(-1, 0, -1, 0);
161   }
162 
163   public void testShardsHaveExpectedRoundTrippables() {
164     // A sanity check.
165     long actual = 0;
166     for (long expected : generateFourByteShardsExpectedRunnables()) {
167       actual += expected;
168     }
169     assertEquals(EXPECTED_FOUR_BYTE_ROUNDTRIPPABLE_COUNT, actual);
170   }
171 
172   private String newString(char... chars) {
173     return new String(chars);
174   }
175 
176   private byte[] toByteArray(int... bytes) {
177     byte[] realBytes = new byte[bytes.length];
178     for (int i = 0; i < bytes.length; i++) {
179       realBytes[i] = (byte) bytes[i];
180     }
181     return realBytes;
182   }
183 
184   private void assertWellFormed(int... bytes) {
185     assertTrue(Utf8.isWellFormed(toByteArray(bytes)));
186   }
187 
188   private void assertNotWellFormed(int... bytes) {
189     assertFalse(Utf8.isWellFormed(toByteArray(bytes)));
190   }
191 
192   private static long[] generateFourByteShardsExpectedRunnables() {
193     long[] expected = new long[128];
194     // 0-63 are all 5300224
195     for (int i = 0; i <= 63; i++) {
196       expected[i] = 5300224;
197     }
198     // 97-111 are all 2342912
199     for (int i = 97; i <= 111; i++) {
200      expected[i] = 2342912;
201     }
202     // 113-117 are all 1048576
203     for (int i = 113; i <= 117; i++) {
204       expected[i] = 1048576;
205     }
206     // One offs
207     expected[112] = 786432;
208     expected[118] = 786432;
209     expected[119] = 1048576;
210     expected[120] = 458752;
211     expected[121] = 524288;
212     expected[122] = 65536;
213     // Anything not assigned was the default 0.
214     return expected;
215   }
216 }
217